Data currently is just tables provided by Zeisel et al, with some attempt to clean up covariates.
data(data, package='Zeisel2015Data')
zeisel = FromMatrix(data$expr, data$cdat, data$fdat)
zeiself = zeisel[freq(zeisel)>.1]
#expressed endogenous genes
zeiself = zeiself[!mcols(zeiself)$primerid %like% 'ERCC-',]
rm(data)
blah = gc()
Load the data, which are log2+1 transformed UMIs, as well as gene and cell covariates.
erccz = zeisel[mcols(zeisel)$primerid %like% 'ERCC']
erccdt = as(erccz, 'data.table')
# "CDR" of the ERCC
erccdt[,cdr_ercc_exclude := cdr_ercc - 1*(value>0)] #exclude current probe
erccdt[, primerid:=factor(primerid)]
erccdt[, wellKey:=factor(wellKey)]
erccdt[, pgex_batch:=factor(pgex_batch)]
Get the ercc data and calculate some features on it.
datatable(head(erccdt))
Using the manufacturers’ values for the relative concentrations of the spike-in mix, and sequences I pre-calculated some of the features including attomoles_per_ul (concentration), AfreqCenter, proportion of adenosine content, minus .25, (etc), the length of the transcript transcript_len_bp. These features were calculated in the data package, see the source under https://github.com/amcdavid/Zeisel2015Data/blob/master/data-raw/datasets.R for details.
The Zeisel data also provided information on the mRNA molecules (total UMIs) mRNA.molecules, the size of the cell pgex_cell_size_um. I inferred a batch ID from the fluidigm run ID pgex_batch and calculated the CDR of the exogenous ERCC cdr_ercc and CDR of endogenous transcript cdr_endo.
The spike-ins might help answer three questions:
Let’s investigate with some GLMs, first. We are fitting models of the form \[ E(Y) = \beta \text{ Conc. ERCC} \]
# Only using the concentration, log-log
unlog = function(x) round(2^x - 1)
lineareffect = lm(unlog(value) ~ attomoles_per_ul, data=erccdt)
loglog = lm(value ~ log2(attomoles_per_ul+1), data=erccdt)
qp_log = glm(unlog(value) ~ attomoles_per_ul, data=erccdt, family='quasipoisson')
qp = glm(unlog(value) ~ attomoles_per_ul, data=erccdt, family=quasipoisson(link='identity'), mustart=predict(qp_log, response='response'))
nb = glm.nb(unlog(value) ~ attomoles_per_ul, data=erccdt, link='identity', mustart=predict(qp_log, response='response'))
model_summary = function(fit){
t_fit = augment(fit, type.residuals='pearson', type.predict='response')
pid = erccdt$primerid
t_fit = t_fit %>% mutate(pearson.resid = resid(fit, type='pearson'), primerid = pid)
cl = match.call()
t_fit = t_fit %>% mutate(f_group = cut(log(.fitted), 10), nonzero=t_fit[,1]>0) %>% group_by(f_group) %>% mutate(f_group_mean = mean(.fitted), mean_resid=mean(pearson.resid))
## Incorrect dispatch on `bam` objects
if('gam' %in% class(fit)){
g_fit = broom:::glance_mcgv(fit)
g_fit$r.squared = summary(fit)$r.sq
} else{
g_fit = glance(fit)
}
if(is.null(g_fit$r.squared)) g_fit$r.squared = 1-g_fit$deviance/g_fit$null.deviance
resid_plot = ggplot(t_fit, aes(y=pearson.resid)) + sprintf('Model=%s, R2 = %1.3f, AIC = %e', deparse(cl[[2]]), g_fit$r.squared, g_fit$AIC) %>% ggtitle() + xlab("Fitted values (binned)")
print(resid_plot +aes(x=f_group)+ geom_boxplot(varwidth=TRUE))
primer_resid = t_fit %>% group_by(primerid) %>% summarise(q25 = quantile(pearson.resid, .25), q75=quantile(pearson.resid, .75)) %>% mutate(primerid = forcats::fct_reorder(primerid, q25))
print(ggplot(primer_resid, aes(x=primerid, ymin=q25, ymax=q75)) + geom_linerange() + ylab('IQR(Pearson Residuals)') +coord_flip())
invisible(list(t_fit, primer_resid))
}
model_summary(lineareffect)
model_summary(loglog)
model_summary(qp)
model_summary(nb)
In concentration-only model, the linear model:
# Giant GAM, use 'cr' basis for sake of speed
theta=c(nb$theta/3, nb$theta*3)
linkv = 'log' #could also try sqrt, hard to diagnose which to use
link = log
estimate_theta = gam(unlog(value) ~ s(link(attomoles_per_ul), bs='cr', k=13) + s(primerid, bs='re'), data=erccdt, family=negbin(link=linkv, theta=theta), mustart=predict(nb, type='response'), optimizer='perf', scale=-1)
With the GAMs, we don’t need to the link function in order to model the linear-ish relationship between attomoles_per_ul and the values, since the splines can handle whatever functional form is present. Instead, the link function controls the additivity. Identity=additive effects, log=multiplicative, sqrt = power.
gam_gc2 = gam(update(estimate_theta$formula, .~. + s(I(GfreqCenter+CfreqCenter), bs='cr', k=12) + s(AfreqCenter, bs='cr', k=12) + s(transcript_len_bp, bs='cr', k=12)),
data=erccdt, optimizer='perf', scale=1.6,
family=negbin(link=linkv, theta=estimate_theta$family$Theta),
mustart=predict(estimate_theta, type='response'))
pearson_scale = function(fit) sum(resid(fit, type='pearson')^2)/fit$df.residual
plot(gam_gc2, scheme=1, scale=0)
I(GfreqCenter+CfreqCenter) is the GC-frequency minus .5.
pearson_scale(gam_gc2)
## [1] 1.000119
We can’t easily estimate the negative binomial dispersion, but we check that the Pearson scale estimate is close to unity.
ms = model_summary(gam_gc2)
Now we add GC content, A content, length of the ERCC in base pairs and a random effect. There is little evidence of a mean relationship between these
```
primer_disp = ms[[2]] %>% left_join(as.data.frame(rowData(erccz)))
disp_plot = ggplot(primer_disp, aes(x=CfreqCenter + GfreqCenter, ymin=q25, ymax=q75))+geom_linerange() + ylab('IQR(Pearson Residuals)') + ggtitle("Dispersion vs ERCC features") + geom_point(aes(y=q75-q25, color='range')) + geom_smooth(aes(y=q75-q25, color='range'))
disp_plot
Let’s look at some other ways to correlated the model dispersion to ERCC features. Here’s the GC frequency.
disp_plot + aes(jitter(attomoles_per_ul, amount=min(attomoles_per_ul)*2)) + scale_x_log10()
disp_plot + aes(x=rank(attomoles_per_ul, ties='random'))
Concentration
disp_plot + aes(x=rank(transcript_len_bp, ties='random'))
Length
Finally we add a random effect for the ERCC-id. This smooths the length and GC relationships.
In conclusion, the linear model fits a negative binomial pretty well. There is evidence of GC and length bias, but the ERCC-specific random effect is much more substantial and distinctly non-normal looking.
Theta is around 5 so the model between a geometric and a Poisson, \(\sigma^2 \approx \mu +\mu^2/5\).
What we really care about is the relationship between the ERCC and endogenous transcript. A first step in modelling this is to examine the relationship between cell-level covariates and counts.
# + s(transcript_len_bp, bs='cr') + s(log2(mRNA.molecules), bs='cr') + s(cdr_ercc_exclude,bs='cr') + s(primerid, bs='re') + s(pgex_batch, bs='re')
# + s(pgex_cell_size_um, bs='cr') + s(mRNA.molecules, bs='cr')
gam_cell = bam(unlog(value)~ s(log(attomoles_per_ul), bs='cr') + s(I(GfreqCenter + CfreqCenter)) + AfreqCenter + I(transcript_len_bp/1000) + s(primerid, bs='re') +poly(cdr_ercc_exclude,2) + poly(cdr_endo,2), data=erccdt, family=negbin(link='log', theta=estimate_theta$family$Theta*2.5), discrete=TRUE)
summary(gam_cell)
##
## Family: Negative Binomial(11.117)
## Link function: log
##
## Formula:
## unlog(value) ~ s(log(attomoles_per_ul), bs = "cr") + s(I(GfreqCenter +
## CfreqCenter)) + AfreqCenter + I(transcript_len_bp/1000) +
## s(primerid, bs = "re") + poly(cdr_ercc_exclude, 2) + poly(cdr_endo,
## 2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8186 0.2043 18.694 < 2e-16 ***
## AfreqCenter -3.3646 2.3485 -1.433 0.152
## I(transcript_len_bp/1000) -0.7829 0.1634 -4.791 1.66e-06 ***
## poly(cdr_ercc_exclude, 2)1 144.2053 0.5756 250.540 < 2e-16 ***
## poly(cdr_ercc_exclude, 2)2 -27.1991 0.5698 -47.734 < 2e-16 ***
## poly(cdr_endo, 2)1 -13.0204 0.5404 -24.092 < 2e-16 ***
## poly(cdr_endo, 2)2 -2.3839 0.5410 -4.407 1.05e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(log(attomoles_per_ul)) 1.651 1.653 1743.906 <2e-16 ***
## s(I(GfreqCenter + CfreqCenter)) 1.002 1.002 9.146 0.0025 **
## s(primerid) 51.153 54.000 555.686 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.897 Deviance explained = 97.6%
## fREML = 2.4643e+05 Scale est. = 1.1584 n = 171285
pearson_scale(gam_cell)
## [1] 1.139404
plot(gam_cell, scale=0, scheme=1)
Add cellular effects–the model hates anything except a log link, which has a dramatic impact on the significance of the CG estimates, for example…
gam_cell2 = update(gam_cell, . ~ s(log(attomoles_per_ul)) + s(pgex_batch, bs='re') + s(primerid, bs='re') + cdr_endo +I(transcript_len_bp/1000)+ s(cdr_ercc_exclude, bs='cr') + s(I(CfreqCenter + GfreqCenter), bs='cr'), family = negbin(theta = estimate_theta$family$Theta*2.5, link='log'))
plot(gam_cell2, scale=0, scheme=1)
pearson_scale(gam_cell2)
## [1] 0.9728619
ms = model_summary(gam_cell2)
primer_disp = ms[[2]] %>% left_join(as.data.frame(rowData(erccz)))
disp_plot = ggplot(primer_disp, aes(x=CfreqCenter + GfreqCenter, ymin=q25, ymax=q75))+geom_linerange() + ylab('IQR(Pearson Residuals)') + ggtitle("Dispersion vs ERCC features") + geom_point(aes(y=q75-q25, color='range')) + geom_smooth(aes(y=q75-q25, color='range'))
disp_plot
disp_plot + aes(jitter(attomoles_per_ul, amount=min(attomoles_per_ul)*2)) + scale_x_log10()
disp_plot + aes(x=rank(attomoles_per_ul, ties='random'))
And actually, we can directly model the variance function with GAMs (at least in a two-step procedure). We just model the log-squared pearson residuals as a additive, smooth function of covariates (corresponding to a log-normal model).
erccdt[,resid:=resid(gam_cell2, type='pearson')]
gam_var = update(gam_cell2, log(resid^2) ~ . - cdr_endo - I(transcript_len_bp/1000) + s(transcript_len_bp, bs='cr') + s(cdr_endo, bs='cr'), family=gaussian())
hist(erccdt[,resid])
plot(gam_var, scale=0, scheme=1)